#####
# Crossing Borders: cleaning municipal population data
#####

library(here)
library(data.table)
library(readxl)

#####
# 1.Read in the overall population data. We need this information to create the weighted-average of travel time for the post-2012 merged municipalities

sheet_names <- excel_sheets(here("data", "cleaning", "MuniPop_1991_2018", "foreigners.xlsx"))

muniPop_f <- rbindlist(lapply(sheet_names, function(sheet_year){
  temp <- suppressMessages(read_xlsx(here("data", "cleaning", "MuniPop_1991_2018", "foreigners.xlsx"), sheet = sheet_year, col_names = FALSE, progress = F)); setDT(temp)
  
  # identify the columns with information and the january and december population, then drop other columns, and give useful names. Need to add "Aeugst am Albis" at the end of this because some years have numbers and names saved in a separate column. This catches that so that we can adjust for it throughout.
  keep_cols <- temp[ , lapply(.SD, function(x) any(grepl("Bezirk|Januar|Dezember|Aeugst am Albis", x)))]
  set(temp, j = colnames(temp)[!keep_cols], value = NULL)
  if(ncol(temp) == 3) setnames(temp, c("mun", "popJan", "popDec")) else setnames(temp, c("mun", "munName", "popJan", "popDec"))
  
  # drop the header rows (which have missing values for mun or pop [first row only]) and trailing rows (missing values for pop variables)
  temp <- na.omit(temp)
  
  # add a column with the year
  set(temp, j = "year", value = as.integer(sheet_year))
  
  # clean out things that are not municipalities
  # all.equal(temp[!grepl(">>|- |Schweiz|Ohne Angabe", mun), mun], temp[grepl("......", mun, fixed = T), mun])
  # temp <- temp[grepl("......", mun, fixed = T) ]
  temp <- temp[grepl("[[:digit:]]", mun) & !grepl("Schweiz", mun)] # drops everything that includes no numbers (like Bezirk names) or that says "Swiss"
  
  # now separate the names and codes
  set(temp, j = "mun", value = gsub("......", "", temp$mun, fixed = T))
  set(temp, j = "bfs", value = as.numeric(gsub("([0-9]+).*$", "\\1", temp$mun))) # gets only numbers
  
  # some of them have muni names in a separate column. In this case, no need to add the name variable
  if(!"munName" %in% colnames(temp)) set(temp, j = "munName", value = gsub("[[:digit:]]", "", temp$mun))
  
  # clean up the original column with numbers (and sometimes names)
  set(temp, j = "mun", value = NULL)
  
  # get the right classes
  for(j in c("popJan", "popDec")) set(temp, j=j, value = as.integer(temp[[j]]))
  
  return(temp)
}), use.names = TRUE)

muniPop_s <- rbindlist(lapply(sheet_names, function(sheet_year){
  temp <- suppressMessages(read_xlsx(here("data", "cleaning", "MuniPop_1991_2018", "swiss.xlsx"), sheet = sheet_year, col_names = FALSE, progress = F)); setDT(temp)
  
  # identify the columns with information and the january and december population, then drop other columns, and give useful names. Need to add "Aeugst am Albis" at the end of this because some years have numbers and names saved in a separate column. This catches that so that we can adjust for it throughout.
  keep_cols <- temp[ , lapply(.SD, function(x) any(grepl("Bezirk|Januar|Dezember|Aeugst am Albis", x)))]
  set(temp, j = colnames(temp)[!keep_cols], value = NULL)
  if(ncol(temp) == 3) setnames(temp, c("mun", "popJan", "popDec")) else setnames(temp, c("mun", "munName", "popJan", "popDec"))
  
  # drop the header rows (which have missing values for mun or pop [first row only]) and trailing rows (missing values for pop variables)
  temp <- na.omit(temp)
  
  # add a column with the year
  set(temp, j = "year", value = as.integer(sheet_year))
  
  # clean out things that are not municipalities
  # all.equal(temp[!grepl(">>|- |Schweiz|Ohne Angabe", mun), mun], temp[grepl("......", mun, fixed = T), mun])
  # temp <- temp[grepl("......", mun, fixed = T) ]
  temp <- temp[grepl("[[:digit:]]", mun) & !grepl("Schweiz", mun)] # drops everything that includes no numbers (like Bezirk names) or that says "Swiss"
  
  # now separate the names and codes
  set(temp, j = "mun", value = gsub("......", "", temp$mun, fixed = T))
  set(temp, j = "bfs", value = as.numeric(gsub("([0-9]+).*$", "\\1", temp$mun))) # gets only numbers
  
  # some of them have muni names in a separate column. In this case, no need to add the name variable
  if(!"munName" %in% colnames(temp)) set(temp, j = "munName", value = gsub("[[:digit:]]", "", temp$mun))
  
  # clean up the original column with numbers (and sometimes names)
  set(temp, j = "mun", value = NULL)
  
  # get the right classes
  for(j in c("popJan", "popDec")) set(temp, j=j, value = as.integer(temp[[j]]))
  
  return(temp)
}), use.names = TRUE)

muniPop_t <- rbindlist(lapply(sheet_names, function(sheet_year){
  temp <- suppressMessages(read_xlsx(here("data", "cleaning", "MuniPop_1991_2018", "all.xlsx"), sheet = sheet_year, col_names = FALSE, progress = F)); setDT(temp)
  
  # identify the columns with information and the january and december population, then drop other columns, and give useful names. Need to add "Aeugst am Albis" at the end of this because some years have numbers and names saved in a separate column. This catches that so that we can adjust for it throughout.
  keep_cols <- temp[ , lapply(.SD, function(x) any(grepl("Bezirk|Januar|Dezember|Aeugst am Albis", x)))]
  set(temp, j = colnames(temp)[!keep_cols], value = NULL)
  if(ncol(temp) == 3) setnames(temp, c("mun", "popJan", "popDec")) else setnames(temp, c("mun", "munName", "popJan", "popDec"))
  
  # drop the header rows (which have missing values for mun or pop [first row only]) and trailing rows (missing values for pop variables)
  temp <- na.omit(temp)
  
  # add a column with the year
  set(temp, j = "year", value = as.integer(sheet_year))
  
  # clean out things that are not municipalities
  # all.equal(temp[!grepl(">>|- |Schweiz|Ohne Angabe", mun), mun], temp[grepl("......", mun, fixed = T), mun])
  # temp <- temp[grepl("......", mun, fixed = T) ]
  temp <- temp[grepl("[[:digit:]]", mun) & !grepl("Schweiz", mun)] # drops everything that includes no numbers (like Bezirk names) or that says "Swiss"
  
  # now separate the names and codes
  set(temp, j = "mun", value = gsub("......", "", temp$mun, fixed = T))
  set(temp, j = "bfs", value = as.numeric(gsub("([0-9]+).*$", "\\1", temp$mun))) # gets only numbers
  
  # some of them have muni names in a separate column. In this case, no need to add the name variable
  if(!"munName" %in% colnames(temp)) set(temp, j = "munName", value = gsub("[[:digit:]]", "", temp$mun))
  
  # clean up the original column with numbers (and sometimes names)
  set(temp, j = "mun", value = NULL)
  
  # get the right classes
  for(j in c("popJan", "popDec")) set(temp, j=j, value = as.integer(temp[[j]]))
  
  return(temp)
}), use.names = TRUE)

rm(sheet_names)

#####
# 2. Fix some unusual cases!
# the below code could probably be merged into the rbindlist() calls above
# 2.a Neukirchen an der thur (bfs4505) was dissolved and merged away on 1996-01-01. Primarily joined Kradolf-Schönenberg (bfs 4501)
new <- muniPop_t[bfs==4501][muniPop_t[bfs==4505], on = "year", .(year, bfs, munName, popJan = popJan + i.popJan, popDec = popDec + i.popDec)]
muniPop_t <- muniPop_t[! bfs %in% c(4505)] # drop neukirchen an der thur
muniPop_t[new, on = c("year", "bfs"), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new)

new <- muniPop_f[bfs==4501][muniPop_f[bfs==4505], on = "year", .(year, bfs, munName, popJan = popJan + i.popJan, popDec = popDec + i.popDec)]
muniPop_f <- muniPop_f[! bfs %in% c(4505)] # drop neukirchen an der thur
muniPop_f[new, on = c("year", "bfs"), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new)

new <- muniPop_s[bfs==4501][muniPop_s[bfs==4505], on = "year", .(year, bfs, munName, popJan = popJan + i.popJan, popDec = popDec + i.popDec)]
muniPop_s <- muniPop_s[! bfs %in% c(4505)] # drop neukirchen an der thur
muniPop_s[new, on = c("year", "bfs"), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new)

# 2.b Illighausen (bfs4670). This merged mostly into Lengwil (bfs4683) on 1998-01-01, but part of it also joined other munis. 
new <- muniPop_t[bfs==4683][muniPop_t[bfs==4670], on = "year", .(year, bfs, munName, popJan = popJan + i.popJan, popDec = popDec + i.popDec)]
muniPop_t <- muniPop_t[! bfs %in% c(4670)]
muniPop_t[new, on = c("year", "bfs"), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new)

new <- muniPop_f[bfs==4683][muniPop_f[bfs==4670], on = "year", .(year, bfs, munName, popJan = popJan + i.popJan, popDec = popDec + i.popDec)]
muniPop_f <- muniPop_f[! bfs %in% c(4670)]
muniPop_f[new, on = c("year", "bfs"), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new)

new <- muniPop_s[bfs==4683][muniPop_s[bfs==4670], on = "year", .(year, bfs, munName, popJan = popJan + i.popJan, popDec = popDec + i.popDec)]
muniPop_s <- muniPop_s[! bfs %in% c(4670)]
muniPop_s[new, on = c("year", "bfs"), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new)

# 2.c Scherzingen (bfs4695). This split into two pieces on 1994-01-01: part became Bottighofen and part became Münsterlingen.
# approach: take the combined population of Bottighofen (bfs4643) and Munsterlingen (bfs4691) on Jan 1 1994. Use the ratio between the two to split up the population from Scherzingen going backward.

# total
ratio <- muniPop_t[year == 1994 & bfs %in% c(4691, 4643), .(bfs, munName, ratio_old = popJan/(sum(popJan)))]

new <- rbindlist(
  list(
    muniPop_t[bfs == 4695 & year < 1994, .(year, popJan = round(popJan * ratio[bfs==4643, ratio_old], 0), popDec = round( popDec * ratio[bfs==4643, ratio_old], 0), bfs = ratio[bfs==4643, bfs], munName = ratio[bfs==4643, munName])],
    muniPop_t[bfs == 4695 & year < 1994, .(year, popJan = round(popJan * ratio[bfs==4691, ratio_old], 0), popDec = round( popDec * ratio[bfs==4691, ratio_old], 0), bfs = ratio[bfs==4691, bfs], munName = ratio[bfs==4691, munName])]
  ), use.names = T
)

# overwrite with the new values
muniPop_t[new, on = c("year", "bfs"), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new)

# Drop Scherzingen altogether
muniPop_t <- muniPop_t[bfs != 4695]

# foreign
# ratio <- muniPop_f[year == 1994 & bfs %in% c(4691, 4643), .(bfs, munName, ratio_old = popJan/(sum(popJan)))]

new <- rbindlist(
  list(
    muniPop_f[bfs == 4695 & year < 1994, .(year, popJan = round(popJan * ratio[bfs==4643, ratio_old], 0), popDec = round( popDec * ratio[bfs==4643, ratio_old], 0), bfs = ratio[bfs==4643, bfs], munName = ratio[bfs==4643, munName])],
    muniPop_f[bfs == 4695 & year < 1994, .(year, popJan = round(popJan * ratio[bfs==4691, ratio_old], 0), popDec = round( popDec * ratio[bfs==4691, ratio_old], 0), bfs = ratio[bfs==4691, bfs], munName = ratio[bfs==4691, munName])]
  ), use.names = T
)

# overwrite with the new values
muniPop_f[new, on = c("year", "bfs"), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new)

# Drop Scherzingen altogether
muniPop_f <- muniPop_f[bfs != 4695]

# swiss
# ratio <- muniPop_s[year == 1994 & bfs %in% c(4691, 4643), .(bfs, munName, ratio_old = popJan/(sum(popJan)))]

new <- rbindlist(
  list(
    muniPop_s[bfs == 4695 & year < 1994, .(year, popJan = round(popJan * ratio[bfs==4643, ratio_old], 0), popDec = round( popDec * ratio[bfs==4643, ratio_old], 0), bfs = ratio[bfs==4643, bfs], munName = ratio[bfs==4643, munName])],
    muniPop_s[bfs == 4695 & year < 1994, .(year, popJan = round(popJan * ratio[bfs==4691, ratio_old], 0), popDec = round( popDec * ratio[bfs==4691, ratio_old], 0), bfs = ratio[bfs==4691, bfs], munName = ratio[bfs==4691, munName])]
  ), use.names = T
)

# overwrite with the new values
muniPop_s[new, on = c("year", "bfs"), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new, ratio)

# Drop Scherzingen altogether
muniPop_s <- muniPop_s[bfs != 4695]

# 2.d. Allmendingen (630), Trimstein (631), and Rubigen (623. All three were together part of Rubigen prior to 1993-01-01. Then they separated. The data thus suggests population = 0 for allmendingen and Trimstein in 1991 and 1992. This can be fixed by figuring out the proportion of the total population that they each made of the old combined municipality, as follows:

ratio <- muniPop_t[year == 1993 & bfs %in% c(630, 631, 623), .(bfs, munName, ratio = popJan/sum(popJan))]
new_values <- muniPop_t[year < 1993 & bfs == 623, .(popJan = round(popJan*ratio$ratio, 0), popDec = round(popDec * ratio$ratio, 0), bfs = unique(ratio$bfs)), by = year]
muniPop_t[new_values, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values)

new_values <- muniPop_f[year < 1993 & bfs == 623, .(popJan = round(popJan*ratio$ratio, 0), popDec = round(popDec * ratio$ratio, 0), bfs = unique(ratio$bfs)), by = year]
muniPop_f[new_values, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values)

new_values <- muniPop_s[year < 1993 & bfs == 623, .(popJan = round(popJan*ratio$ratio, 0), popDec = round(popDec * ratio$ratio, 0), bfs = unique(ratio$bfs)), by = year]
muniPop_s[new_values, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values)

# 2.e Basadingen (4531) split up on 1999-01-01 to form two new places: Schlatt (TG) (4546) and Basadingen-Schlattingen (4536). Using the data from 1999-01-01, we can figure out the rough share of the earlier population (which is recorded only for Basadingen-Schlattingen) eventually belonged to Schlatt (TG).

ratio <- muniPop_t[year == 1999 & bfs %in% c(4536, 4546), .(bfs, munName, ratio = popJan/sum(popJan))]
new_values <- muniPop_t[year < 1999 & bfs == 4536, .(popJan = round(popJan * ratio$ratio, 0), popDec = round(popDec * ratio$ratio, 0), bfs = unique(ratio$bfs)), by = year]
muniPop_t[new_values, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values)

new_values <- muniPop_f[year < 1999 & bfs == 4536, .(popJan = round(popJan * ratio$ratio, 0), popDec = round(popDec * ratio$ratio, 0), bfs = unique(ratio$bfs)), by = year]
muniPop_f[new_values, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values)

new_values <- muniPop_s[year < 1999 & bfs == 4536, .(popJan = round(popJan * ratio$ratio, 0), popDec = round(popDec * ratio$ratio, 0), bfs = unique(ratio$bfs)), by = year]
muniPop_s[new_values, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values, ratio)

# 2.f The municipalities Pfyn (4841) and Uesslingen-Buch (4616) had a complicated merger on 1995-01-01. Uesslingen split between Uesslingen and Warth. Uesslingen joined Buch to form Uesslingen-Buch. Warth joined Weiningen, which separated from Pfyn, to form Warth-Weiningen (4621). This means that Uesslingen-Buch + Warth-Weiningen + Pfyn on 1995-01-01 give us the total population of Pfyn, Uesslingen, and Buch on 1994-12-31. 

# post-mutation total pop equals pre-mutation total. Good.
# muniPop_t[year==1995 & bfs %in% c(4841, 4616, 4621), sum(popJan)]
# muniPop_t[year==1994 & bfs %in% c(4841, 4616), sum(popDec)]

ratio_pfyn <- muniPop_t[year %in% c(1994:1995) & bfs == 4841, popJan[year==1995]/popDec[year==1994]]
ratio_uess <- muniPop_t[year %in% c(1994:1995) & bfs == 4616, popJan[year==1995]/popDec[year==1994]]
# ratio_ww_from_pfyn <- 1 - ratio_pfyn
# ratio_ww_from_uess <- 1 - ratio_uess

# total pop
new_values_pfyn <- muniPop_t[year < 1995 & bfs == 4841, .(bfs, year, round(popJan * ratio_pfyn, 0), round(popDec * ratio_pfyn, 0))]
new_values_uess <- muniPop_t[year < 1995 & bfs == 4616, .(bfs, year, round(popJan * ratio_uess, 0), round(popDec * ratio_uess, 0))]
new_values_pfyn_and_uess <- new_values_pfyn[new_values_uess, on = "year"][ , .(year, popJan_pu = V3 + i.V3, popDec_pu = V4 + i.V4)]
new_values_ww <- muniPop_t[year < 1995 & bfs %in% c(4841, 4616), .(popJan = sum(popJan), popDec = sum(popDec)), by = year][new_values_pfyn_and_uess, on = "year", .(bfs = 4621, year, popJan = popJan - popJan_pu, popDec = popDec - popDec_pu)]

muniPop_t[new_values_pfyn, on = .(year, bfs), `:=`(popJan = V3, popDec = V4)]; rm(new_values_pfyn)
muniPop_t[new_values_uess, on = .(year, bfs), `:=`(popJan = V3, popDec = V4)]; rm(new_values_uess)
muniPop_t[new_values_ww, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_ww, new_values_pfyn_and_uess)

# foreign
new_values_pfyn <- muniPop_f[year < 1995 & bfs == 4841, .(bfs, year, round(popJan * ratio_pfyn, 0), round(popDec * ratio_pfyn, 0))]
new_values_uess <- muniPop_f[year < 1995 & bfs == 4616, .(bfs, year, round(popJan * ratio_uess, 0), round(popDec * ratio_uess, 0))]
new_values_pfyn_and_uess <- new_values_pfyn[new_values_uess, on = "year"][ , .(year, popJan_pu = V3 + i.V3, popDec_pu = V4 + i.V4)]
new_values_ww <- muniPop_f[year < 1995 & bfs %in% c(4841, 4616), .(popJan = sum(popJan), popDec = sum(popDec)), by = year][new_values_pfyn_and_uess, on = "year", .(bfs = 4621, year, popJan = popJan - popJan_pu, popDec = popDec - popDec_pu)]

muniPop_f[new_values_pfyn, on = .(year, bfs), `:=`(popJan = V3, popDec = V4)]; rm(new_values_pfyn)
muniPop_f[new_values_uess, on = .(year, bfs), `:=`(popJan = V3, popDec = V4)]; rm(new_values_uess)
muniPop_f[new_values_ww, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_ww, new_values_pfyn_and_uess)

# swiss
new_values_pfyn <- muniPop_s[year < 1995 & bfs == 4841, .(bfs, year, round(popJan * ratio_pfyn, 0), round(popDec * ratio_pfyn, 0))]
new_values_uess <- muniPop_s[year < 1995 & bfs == 4616, .(bfs, year, round(popJan * ratio_uess, 0), round(popDec * ratio_uess, 0))]
new_values_pfyn_and_uess <- new_values_pfyn[new_values_uess, on = "year"][ , .(year, popJan_pu = V3 + i.V3, popDec_pu = V4 + i.V4)]
new_values_ww <- muniPop_s[year < 1995 & bfs %in% c(4841, 4616), .(popJan = sum(popJan), popDec = sum(popDec)), by = year][new_values_pfyn_and_uess, on = "year", .(bfs = 4621, year, popJan = popJan - popJan_pu, popDec = popDec - popDec_pu)]

muniPop_s[new_values_pfyn, on = .(year, bfs), `:=`(popJan = V3, popDec = V4)]; rm(new_values_pfyn)
muniPop_s[new_values_uess, on = .(year, bfs), `:=`(popJan = V3, popDec = V4)]; rm(new_values_uess)
muniPop_s[new_values_ww, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_ww, new_values_pfyn_and_uess, ratio_pfyn, ratio_uess)

# 2.g Lommis (4741) did a double split on 1995-01-01. Bettwiesen (4716) separated and another part joined Thundorf (4611).

# population totals add up correctly
# muniPop_t[year==1995 & bfs %in% c(4741, 4716, 4611), sum(popJan)]
# muniPop_t[year==1994 & bfs %in% c(4741, 4611), sum(popDec)]

ratio_lommis <- muniPop_t[year %in% 1994:1995 & bfs == 4741, popJan[year==1995]/popDec[year==1994]]
ratio_bett <- muniPop_t[year %in% 1994:1995 & bfs %in% c(4716, 4741), popJan[year==1995 & bfs == 4716]/popDec[year==1994 & bfs == 4741]]
ratio_belonging_to_thundorf <- 1 - ratio_lommis - ratio_bett

# total
new_values_lommis <- muniPop_t[year < 1995 & bfs == 4741, .(bfs, year, popJan = round(popJan * ratio_lommis, 0), popDec = round(popDec * ratio_lommis, 0))]
new_values_bett <- muniPop_t[year < 1995 & bfs == 4741, .(bfs = 4716, year, popJan = round(popJan * ratio_bett, 0), popDec = round(popDec * ratio_bett, 0))]
new_values_thundorf <- muniPop_t[year < 1995 & bfs %in% c(4741, 4611), .(bfs = 4611, year = year[bfs==4611], popJan = popJan[bfs==4611] + round(popJan[bfs==4741] * ratio_belonging_to_thundorf, 0), popDec = popDec[bfs==4611] + round(popDec[bfs==4741] * ratio_belonging_to_thundorf, 0))]

muniPop_t[new_values_lommis, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_lommis)
muniPop_t[new_values_bett, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_bett)
muniPop_t[new_values_thundorf, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_thundorf)

# foreign
new_values_lommis <- muniPop_f[year < 1995 & bfs == 4741, .(bfs, year, popJan = round(popJan * ratio_lommis, 0), popDec = round(popDec * ratio_lommis, 0))]
new_values_bett <- muniPop_f[year < 1995 & bfs == 4741, .(bfs = 4716, year, popJan = round(popJan * ratio_bett, 0), popDec = round(popDec * ratio_bett, 0))]
new_values_thundorf <- muniPop_f[year < 1995 & bfs %in% c(4741, 4611), .(bfs = 4611, year = year[bfs==4611], popJan = popJan[bfs==4611] + round(popJan[bfs==4741] * ratio_belonging_to_thundorf, 0), popDec = popDec[bfs==4611] + round(popDec[bfs==4741] * ratio_belonging_to_thundorf, 0))]

muniPop_f[new_values_lommis, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_lommis)
muniPop_f[new_values_bett, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_bett)
muniPop_f[new_values_thundorf, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_thundorf)

# swiss
new_values_lommis <- muniPop_s[year < 1995 & bfs == 4741, .(bfs, year, popJan = round(popJan * ratio_lommis, 0), popDec = round(popDec * ratio_lommis, 0))]
new_values_bett <- muniPop_s[year < 1995 & bfs == 4741, .(bfs = 4716, year, popJan = round(popJan * ratio_bett, 0), popDec = round(popDec * ratio_bett, 0))]
new_values_thundorf <- muniPop_s[year < 1995 & bfs %in% c(4741, 4611), .(bfs = 4611, year = year[bfs==4611], popJan = popJan[bfs==4611] + round(popJan[bfs==4741] * ratio_belonging_to_thundorf, 0), popDec = popDec[bfs==4611] + round(popDec[bfs==4741] * ratio_belonging_to_thundorf, 0))]

muniPop_s[new_values_lommis, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_lommis)
muniPop_s[new_values_bett, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_bett)
muniPop_s[new_values_thundorf, on = .(year, bfs), `:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_thundorf, ratio_lommis, ratio_bett, ratio_belonging_to_thundorf)

# 2.h Tobel-Tägerschen (4776) saw Braunau (4723) break away in 1999

# populations add up as they should
# muniPop_t[year %in% 1998:1999 & bfs %in% c(4776, 4723), sum(popJan[year==1999]) == popDec[year==1998 & bfs == 4776]]

ratio_tobel <- muniPop_t[year %in% 1998:1999 & bfs %in% c(4776), popJan[year==1999]/popDec[year==1998]]

#total
new_values_tobel <- muniPop_t[year < 1999 & bfs == 4776, .(bfs, year, popJan = round(popJan * ratio_tobel, 0), popDec = round(popDec * ratio_tobel, 0))]
new_values_braun <- muniPop_t[year < 1999 & bfs == 4776, .(bfs = 4723, year, popJan = round(popJan * (1-ratio_tobel), 0), popDec = round(popDec * (1-ratio_tobel), 0))]

muniPop_t[new_values_tobel, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_tobel)
muniPop_t[new_values_braun, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_braun)

#foreign
new_values_tobel <- muniPop_f[year < 1999 & bfs == 4776, .(bfs, year, popJan = round(popJan * ratio_tobel, 0), popDec = round(popDec * ratio_tobel, 0))]
new_values_braun <- muniPop_f[year < 1999 & bfs == 4776, .(bfs = 4723, year, popJan = round(popJan * (1-ratio_tobel), 0), popDec = round(popDec * (1-ratio_tobel), 0))]

muniPop_f[new_values_tobel, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_tobel)
muniPop_f[new_values_braun, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_braun)

#swiss
new_values_tobel <- muniPop_s[year < 1999 & bfs == 4776, .(bfs, year, popJan = round(popJan * ratio_tobel, 0), popDec = round(popDec * ratio_tobel, 0))]
new_values_braun <- muniPop_s[year < 1999 & bfs == 4776, .(bfs = 4723, year, popJan = round(popJan * (1-ratio_tobel), 0), popDec = round(popDec * (1-ratio_tobel), 0))]

muniPop_s[new_values_tobel, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_tobel)
muniPop_s[new_values_braun, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_braun, ratio_tobel)

# 2.i On 1997-01-01 the new municipality of Eschlikon (4724) was created from parts of Sirnach (4761)

#populations add up as they should
# muniPop_t[year %in% 1996:1997 & bfs %in% c(4761, 4724), sum(popJan[year==1997]) == popDec[year==1996 & bfs == 4761]]

ratio_sirn <- muniPop_t[year %in% 1996:1997 & bfs %in% c(4761), popJan[year==1997]/popDec[year==1996]]

#total
new_values_sirn <- muniPop_t[year < 1997 & bfs == 4761, .(bfs, year, popJan = round(popJan * ratio_sirn, 0), popDec = round(popDec * ratio_sirn, 0))]
new_values_esch <- muniPop_t[year < 1997 & bfs == 4761, .(bfs = 4724, year, popJan = round(popJan * (1-ratio_sirn), 0), popDec = round(popDec * (1-ratio_sirn), 0))]

muniPop_t[new_values_sirn, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_sirn)
muniPop_t[new_values_esch, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_esch)

#foreign
new_values_sirn <- muniPop_f[year < 1997 & bfs == 4761, .(bfs, year, popJan = round(popJan * ratio_sirn, 0), popDec = round(popDec * ratio_sirn, 0))]
new_values_esch <- muniPop_f[year < 1997 & bfs == 4761, .(bfs = 4724, year, popJan = round(popJan * (1-ratio_sirn), 0), popDec = round(popDec * (1-ratio_sirn), 0))]

muniPop_f[new_values_sirn, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_sirn)
muniPop_f[new_values_esch, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_esch)

#swiss
new_values_sirn <- muniPop_s[year < 1997 & bfs == 4761, .(bfs, year, popJan = round(popJan * ratio_sirn, 0), popDec = round(popDec * ratio_sirn, 0))]
new_values_esch <- muniPop_s[year < 1997 & bfs == 4761, .(bfs = 4724, year, popJan = round(popJan * (1-ratio_sirn), 0), popDec = round(popDec * (1-ratio_sirn), 0))]

muniPop_s[new_values_sirn, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_sirn)
muniPop_s[new_values_esch, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_esch, ratio_sirn)

# 2.j On 1998-01-01 the new municipality of Wilen (TG) (4786) was created from parts of Rickenbach (TG) (4751).

#populations add up as they should
# muniPop_t[year %in% 1997:1998 & bfs %in% c(4751, 4786), sum(popJan[year==1998]) == popDec[year==1997 & bfs == 4751]]

ratio_rick <- muniPop_t[year %in% 1997:1998 & bfs %in% c(4751), popJan[year==1998]/popDec[year==1997]]

#total
new_values_rick <- muniPop_t[year < 1998 & bfs == 4751, .(bfs, year, popJan = round(popJan * ratio_rick, 0), popDec = round(popDec * ratio_rick, 0))]
new_values_wile <- muniPop_t[year < 1998 & bfs == 4751, .(bfs = 4786, year, popJan = round(popJan * (1-ratio_rick), 0), popDec = round(popDec * (1-ratio_rick), 0))]

muniPop_t[new_values_rick, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_rick)
muniPop_t[new_values_wile, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_wile)

#foreign
new_values_rick <- muniPop_f[year < 1998 & bfs == 4751, .(bfs, year, popJan = round(popJan * ratio_rick, 0), popDec = round(popDec * ratio_rick, 0))]
new_values_wile <- muniPop_f[year < 1998 & bfs == 4751, .(bfs = 4786, year, popJan = round(popJan * (1-ratio_rick), 0), popDec = round(popDec * (1-ratio_rick), 0))]

muniPop_f[new_values_rick, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_rick)
muniPop_f[new_values_wile, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_wile)

#swiss
new_values_rick <- muniPop_s[year < 1998 & bfs == 4751, .(bfs, year, popJan = round(popJan * ratio_rick, 0), popDec = round(popDec * ratio_rick, 0))]
new_values_wile <- muniPop_s[year < 1998 & bfs == 4751, .(bfs = 4786, year, popJan = round(popJan * (1-ratio_rick), 0), popDec = round(popDec * (1-ratio_rick), 0))]

muniPop_s[new_values_rick, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_rick)
muniPop_s[new_values_wile, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_wile, ratio_rick)

# 2.k This is a double-whammy. Mammern (4826) separated from Steckborn (4864) on 1993-01-01. Then on 1998-01-01, part of Steckborn joined Homburg (4816). Ignore the second part for now.

#populations add up as they should
# muniPop_t[year %in% 1992:1993 & bfs %in% c(4864, 4826), sum(popJan[year==1993]) == popDec[year==1992 & bfs == 4864]]

ratio_steck <- muniPop_t[year %in% 1992:1993 & bfs %in% c(4864), popJan[year==1993]/popDec[year==1992]]

#total
new_values_steck <- muniPop_t[year < 1993 & bfs == 4864, .(bfs, year, popJan = round(popJan * ratio_steck, 0), popDec = round(popDec * ratio_steck, 0))]
new_values_mamm <- muniPop_t[year < 1993 & bfs == 4864, .(bfs = 4826, year, popJan = round(popJan * (1-ratio_steck), 0), popDec = round(popDec * (1-ratio_steck), 0))]

muniPop_t[new_values_steck, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_steck)
muniPop_t[new_values_mamm, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_mamm)

#foreign
new_values_steck <- muniPop_f[year < 1993 & bfs == 4864, .(bfs, year, popJan = round(popJan * ratio_steck, 0), popDec = round(popDec * ratio_steck, 0))]
new_values_mamm <- muniPop_f[year < 1993 & bfs == 4864, .(bfs = 4826, year, popJan = round(popJan * (1-ratio_steck), 0), popDec = round(popDec * (1-ratio_steck), 0))]

muniPop_f[new_values_steck, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_steck)
muniPop_f[new_values_mamm, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_mamm)

#swiss
new_values_steck <- muniPop_s[year < 1993 & bfs == 4864, .(bfs, year, popJan = round(popJan * ratio_steck, 0), popDec = round(popDec * ratio_steck, 0))]
new_values_mamm <- muniPop_s[year < 1993 & bfs == 4864, .(bfs = 4826, year, popJan = round(popJan * (1-ratio_steck), 0), popDec = round(popDec * (1-ratio_steck), 0))]

muniPop_s[new_values_steck, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_steck)
muniPop_s[new_values_mamm, on = .(bfs, year),`:=`(popJan = i.popJan, popDec = i.popDec)]; rm(new_values_mamm, ratio_steck)

# End removing three unusual cases
#####

#####
# 3. Do some checks on this data

# use the up-to-date municipality merger crosswalk, courtesy of MM and DW
cw <- fread(here("data", "cleaning", "MuniCrossWalk_MM", "cw2019_dec31mergers.csv"))

# looks like the have the same overlapping municipalities from 2006 through 2018
# muniPop_t[, .N, year][muniPop_s[,.N,year], on = "year"][muniPop_f[,.N,year], on = "year"][, .(year, N_all = N, N_ch = i.N, N_aus = i.N.1)][cw[,.N,year], on = "year"][year %between% c(1991, 2018)][ , .(year, N_all, N_ch, N_aus, N_cw = N)]

# for(Y in 2018:2006){
# print(setdiff_dw(muniPop_t[year==Y, bfs], muniPop_s[year==Y, bfs]))
# print(setdiff_dw(muniPop_f[year==Y, bfs], muniPop_s[year==Y, bfs]))
# print(setdiff_dw(muniPop_t[year==Y, bfs], muniPop_f[year==Y, bfs]))
# }

# make sure that totals sum up right ofr the 2006-2018 period where N muni overlap correctly.
# muniPop_t[year %between% c(2006, 2018)][muniPop_s[year %between% c(2006, 2018)], on = c("bfs", "year")][muniPop_f[year %between% c(2006, 2018)], on = c("bfs", "year")][ , .(popJan == (i.popJan + i.popJan.1), popDec == (i.popDec + i.popDec.1)), by = .(bfs, year)][ , table(V1, V2)] # the numbers add up perfectly for 2006 to 2018! HOORAY
# muniPop_t[year %between% c(2006, 2018), .N] # and the number of cases matches the output of the above. Nothing wrong here!

# make sure that totals sum up right for the 1991-1999 period when the N muni overlap
# muniPop_t[year %between% c(1991, 1999)][muniPop_s[year %between% c(1991, 1999)], on = c("bfs", "year")][muniPop_f[year %between% c(1991, 1999)], on = c("bfs", "year")][ , .(popJan == (i.popJan + i.popJan.1), popDec == (i.popDec + i.popDec.1)), by = .(bfs, year)][ , table(V1, V2)] # the numbers add up perfectly for 1991 to 1999 save for the special case mentioned above (bfs4691 & bfs4643! HOORAY
# muniPop_t[year %between% c(1991, 1999), .N] # and the number of cases matches the output of the above. Nothing wrong here!

# Note: there is no direct way to check the 2000 to 2005 period because the data use different "municipality vintages."
# Here we have the total data with 2,726 municipalities per year, i.e., the vintage 2006 municipalities! The Swiss data has v2006 municipalities for 2005, but for 2000-2004 it has the v2005 municipalities. The Foreigner data has v2005 munis for 2000-01 and 03-05 and v2006 for 02. 


#####
# 4. Create a constant panel of v2019 municipalities

### A. create an identifier for the 'vintage' of municipalities to be used.
total_vintages <- muniPop_t[ , .N, year][cw[ , .N, year], on = "N", nomatch = NULL, .(year, m_year = i.year)]
muniPop_t[ total_vintages, on = "year", m_year := m_year ]; rm(total_vintages)

foreign_vintages <- muniPop_f[ , .N, year][cw[ , .N, year], on = "N", nomatch = NULL, .(year, m_year = i.year)]
muniPop_f[ foreign_vintages, on = "year", m_year := m_year ]; rm(foreign_vintages)

swiss_vintages <- muniPop_s[ , .N, year][cw[ , .N, year], on = "N", nomatch = NULL, .(year, m_year = i.year)]
muniPop_s[ swiss_vintages, on = "year", m_year := m_year ]; rm(swiss_vintages)

### B. Add in the vintage 2019 bfs numbers
muniPop_t[ cw, on = c(m_year = "year", bfs = "bfs"), bfs19 := bfs19]
muniPop_t_bp <- muniPop_t[ , .(popJanTotal = sum(popJan), popDecTotal = sum(popDec)), keyby = .(year, bfs19)]

muniPop_f[ cw, on = c(m_year = "year", bfs = "bfs"), bfs19 := bfs19]
muniPop_f_bp <- muniPop_f[ , .(popJanForiegn = sum(popJan), popDecForeign = sum(popDec)), keyby = .(year, bfs19)]

muniPop_s[ cw, on = c(m_year = "year", bfs = "bfs"), bfs19 := bfs19]
muniPop_s_bp <- muniPop_s[ , .(popJanSwiss = sum(popJan), popDecSwiss = sum(popDec)), keyby = .(year, bfs19)]

# dim(muniPop_t_bp)
# dim(muniPop_f_bp)
# dim(muniPop_s_bp)

muniPop <- muniPop_t_bp[muniPop_f_bp][muniPop_s_bp]; rm(muniPop_t_bp, muniPop_f_bp, muniPop_s_bp)

message(
  "You've produced the following datasets:
  * muniPop: total, swiss, and foreign populations for the 2019-vintage municipalities.
  * muniPop_t: total populations by year according to the municipality vintage provided by bfs. Contains links to the 2019-vintange.
  * muniPop_s and muniPop_f: same as above but for s-wiss and f-oreign.
  * cw: the c-ross w-alk linking historic bfs numbers to 2016 through 2019 bfs numbers."
  )

# some checks: 
# muniPop[popJanTotal==0][order(bfs19, -year)] # no more empty numbers!
# muniPop[ popJanTotal - (popJanForiegn + popJanSwiss) != 0] # only cases where the january populations don't add up correctly are the strange case of Scherzingen (those are only off due to rounding) 
# muniPop[, .(year, popJanTotal - c(NA, popDecTotal[-.N])), keyby = bfs19][V2 > 200][order(bfs19, year)] there are about 32 cases where the numbers from year-to-year change by more than 200. Not bad out of 62K total. Probably not worth chasing these down at this point.
